Getting Started

Getting the Predictors Right

First, we load the TCGA data to have the name of the predictors (tf and miR medoids) and look for them in the CCLE data.

rm(list=ls())
source("00-paths.R")



load(file.path(paths$clean, 'xena-tcga.Rda'))
load(file.path(paths$scratch, paste('medoids_tf_all_tpm.Rda', sep = '')))
load(file.path(paths$scratch, paste('medoids_mir_all_tpm.Rda', sep = '')))

dim(mirMedoid)
## [1] 10013    28
dim(tfMedoid)
## [1] 10013    19
load(file.path(paths$clean, 'ccle.Rda'))
mir_name_cleaner <- function(name){(sub("-(3p|5p)$", "", name))}
tf_name_cleaner <- function(name){(sub("\\..*$", "", name))}
  
find_tcga_medoid_in_ccle <- function(tcga_medoid_names, ccle_feature_names, name_cleaner, d2MeanList, tcga_data, ccle_data){
  
  matched_names <- vector("character", length(tcga_medoid_names))
  for (i in seq_along(tcga_medoid_names)) {
    name <- tcga_medoid_names[i]
    d2Mean <- d2MeanList[[i]]
    d2Mean_sorted <- sort(d2Mean)
    trial <- 1
    
    while(TRUE) {
      trial <- trial + 1
      hist(as.numeric(tcga_data[name,]), main=paste('tcga', name), breaks = 123)
          
      if (name %in% ccle_feature_names) {
        matched_names[i] <- name
        break
      }
      
      name_clean <- name_cleaner(name)
      # Sometimes mirA-3p has matched to mirA and now mirA-5p is considered
      # We want to avoid mirA-5p going to mirA again, so we skip it.
      if (name_clean %in% ccle_feature_names && !(name_clean %in% matched_names)) {
        matched_names[i] <- name_clean
        break
      }
      next_smallest_value <- d2Mean_sorted[trial]
      name <- names(d2Mean)[which(d2Mean == next_smallest_value)]
    }
    hist(as.numeric(ccle_data[matched_names[i], ]), main=paste('ccle', matched_names[i]), breaks = 123)
    # print(paste("Converted", name, "in trial", trial - 1, "out of", length(d2Mean), "to", matched_names[i]))
  }
  return(matched_names)
  
}

load(file.path(paths$scratch, paste('d2MeanList_mir_all_tpm.Rda', sep = '')))
print("Converting microRNA names:")
## [1] "Converting microRNA names:"
ccle_mir_medoid_names <- find_tcga_medoid_in_ccle(colnames(mirMedoid), rownames(ccle$mir), mir_name_cleaner, d2MeanList, tcga$mir, ccle$mir)

# length(unique(ccle_mir_medoid_names))
print("CCLE miR medoid names:")
## [1] "CCLE miR medoid names:"
ccle_mir_medoid_names
##  [1] "hsa-miR-339-3p"  "hsa-miR-10b"     "hsa-miR-145"     "hsa-miR-216a"   
##  [5] "hsa-miR-487a"    "hsa-miR-200a"    "hsa-miR-340"     "hsa-miR-95"     
##  [9] "hsa-miR-34b"     "hsa-miR-185"     "hsa-miR-153"     "hsa-miR-193b"   
## [13] "hsa-miR-27b"     "hsa-let-7e"      "hsa-miR-1537"    "hsa-miR-374b"   
## [17] "hsa-miR-199b-5p" "hsa-miR-362-5p"  "hsa-miR-29c"     "hsa-miR-487b"   
## [21] "hsa-miR-769-5p"  "hsa-miR-1976"    "hsa-miR-525-5p"  "hsa-miR-506"    
## [25] "hsa-miR-28-5p"   "hsa-miR-590-5p"  "hsa-miR-139-5p"  "hsa-miR-671-3p"
load(file.path(paths$scratch, paste('d2MeanList_tf_all_tpm.Rda', sep = '')))
cleaned_ccle_tf_names <- sapply(rownames(ccle$tf), tf_name_cleaner)
print("Converting TF names:")
## [1] "Converting TF names:"
rownames(ccle$tf) <- cleaned_ccle_tf_names
ccle_tf_medoid_names <- find_tcga_medoid_in_ccle(colnames(tfMedoid), cleaned_ccle_tf_names, tf_name_cleaner, d2MeanList, tcga$tf, ccle$tf)

# length(unique(ccle_tf_medoid_names))
print("CCLE TF medoid names:")
## [1] "CCLE TF medoid names:"
ccle_tf_medoid_names
##  [1] "ENSG00000109381" "ENSG00000116833" "ENSG00000213523" "ENSG00000136535"
##  [5] "ENSG00000177374" "ENSG00000143178" "ENSG00000174576" "ENSG00000111206"
##  [9] "ENSG00000187140" "ENSG00000185129" "ENSG00000124160" "ENSG00000109320"
## [13] "ENSG00000149136" "ENSG00000182742" "ENSG00000139515" "ENSG00000162772"
## [17] "ENSG00000185811" "ENSG00000177551" "ENSG00000167034"
# rm(tfMedoid, mirMedoid)

Now we can select medoids from CCLE data:

ccle_tf_medoid <- t(as.matrix(ccle$tf[ccle_tf_medoid_names, ]))

ccle_mir_medoid <- t(as.matrix(ccle$mir[ccle_mir_medoid_names, ]))

Getting the Outcomes Right

Need to know the name of outcomes (TCGA’s mRNA) to know what genes to predict in CCLE.

load(file.path(paths$clean, 'xena-tcga.Rda'))
tcga_mrna_names <- sapply(rownames(tcga$mrna), tf_name_cleaner)
ccle_mrna_names <- sapply(rownames(ccle$mrna), tf_name_cleaner)

sum(ccle_mrna_names %in% tcga_mrna_names)
## [1] 18706
length(ccle_mrna_names)
## [1] 18706
rm(tcga, cohorts, mRNARest)

It seems that we can predict all of the CCLE’s genes!

library(ClassComparison)
## Loading required package: oompaBase
f <- file.path(paths$scratch, "Y_ccle_tpm.Rda")
if (file.exists(f)) {
  load(f)
} else {
  # common ordering of names for future
  Ynames <- intersect(tcga_mrna_names, ccle_mrna_names)
  Y_ccle <- as.matrix(ccle$mrna[Ynames, ])
  save(Y_ccle, Ynames, file = f)
}

length(Ynames)
## [1] 18706
dim(Y_ccle)
## [1] 18706   942

Testing Transportability using the Tissue-Aware model with (TF, miR)

Load the learned model from TCGA:

computeCor <- function(res, Y, ns=10^4){
  res <- as.matrix(res)
  Y <- as.matrix(Y)
  Yhat <- Y - res
  M <- nrow(Y)
  Ours <- sapply(1:M, function(i) cor(Y[i,], Yhat[i,]))
  
  Yind <- sample(1:M, ns, replace = T)
  YhatInd <- sample(1:M, ns, replace = T)
  ind <- cbind(Yind, YhatInd)
  ind <- ind[Yind != YhatInd, ]
  Null <- sapply(1:nrow(ind), function(j) cor(Y[ind[j,1],], Yhat[ind[j,2],]))
  z <- quantile(Null, .95, na.rm = T)
  acc <- sum(Ours > z, na.rm = T) / M
  return(list(ours=Ours, null=Null, acc=acc))
}

load(file.path(paths$scratch, paste('final_model_tpm_', 15, '.Rda', sep = '')))
f <- file.path(paths$scratch, paste('ccle_test_tpm_', model$nc, '.Rda', sep = ''))

if(!file.exists(f)){
  cleaned_mrna_names <- sapply(rownames(model$alpha), tf_name_cleaner)
  rownames(model$alpha) <- cleaned_mrna_names
  Y <- sweep(Y_ccle, 1, model$alpha[Ynames,]) #removing global model intercept
  X <- cbind(ccle_tf_medoid, ccle_mir_medoid)
  
  centers <- model$centers
  
  #what if we only use TFs to find the closest centers?
  cat(paste("\n  Testing with ", nrow(centers), " clusters:"))
  M <- nrow(Y); N <- ncol(Y); #nrow(X) == N
  D <- matrix(0, nrow = N, ncol = nrow(centers))
  for(i in 1:nrow(centers)){
    center <- centers[i, ]
    D[,i] <- apply(X, 1, function(x) sum((x - center)^2))
  }
  C <- apply(D, 1, which.min)
  
  res <- matrix(0, nrow = M, ncol = N)
  aX <- cbind(rep(1, nrow(X)), X) #augmented X
  for(i in 1:nrow(centers)){
    print(paste("Cluster", i, "has size", sum(C==i)))
    if(sum(C==i) == 0) next;
    Beta <- model$coeffMatList[[i]]
    colnames(Beta) <- cleaned_mrna_names
    res[, C==i] <- Y[,C==i] - t(aX[C==i, ] %*% Beta[,Ynames])
  }
  rmse <- sqrt((1/N) * apply(res ^ 2, 1, sum))
  cat(paste("\n    Averge RMSE:", round(mean(rmse),2)))
  corr <- computeCor(res, Y_ccle)
  # corr <- computeCor(res, Y)
  cat(paste("\n    Averge Cor:", round((corr$acc),2)))
  ccle_results <- list(res=res, rmse=rmse, corr=corr)
  save(ccle_results, file = f)
} else{
  load(f)
}

cat(paste("\n    Averge RMSE:", round(mean(ccle_results$rmse),2)))
## 
##     Averge RMSE: 3.83
cat(paste("\n    Averge Cor:", round((ccle_results$corr$acc),2)))
## 
##     Averge Cor: 0.23
compute_cor_acc <- function(residuals, y, ns = 10^4){
  set.seed(12345)
  yhat <- y - residuals
  M <- nrow(y)
  Ours <- sapply(1:M, function(i) cor(y[i,], yhat[i,]))
  cat("\n Ours info:\n")
  cat(summary(Ours))
  
  Yind <- sample(1:M, ns, replace = T)
  YhatInd <- sample(1:M, ns, replace = T)
  ind <- cbind(Yind, YhatInd)
  ind <- ind[Yind != YhatInd, ]
  Null <- sapply(1:nrow(ind), function(j) cor(y[ind[j,1],], yhat[ind[j,2],]))
  
  # Null <- cor(t(y[ind[,1],]), t(yhat[ind[,2],])) #cor is column-wise
  
  
  z <- quantile(Null, .95, na.rm = T)
  acc <- sum(Ours > z, na.rm = T) / M
  cat("\n Null info:\n")
  cat(summary(Null))
  cat("\n Cut-off:", z)
  # Load necessary library
  
  # Create separate data frames
  df_ours <- data.frame(value = Ours, category = "Ours")
  df_null <- data.frame(value = Null, category = "Null")
  
  # Combine data frames
  df <- rbind(df_ours, df_null)
  
  # Plot the histograms
  library(ggplot2)
  p <- ggplot(df, aes(x = value, fill = category)) +
    geom_histogram(aes(y = ..density..), position = "identity", alpha = 0.5, bins = 30) +
    labs(title = "Histogram of Ours and Null", 
         x = "Value", 
         y = "Frequency") +
    scale_fill_manual(values = c("blue", "red")) +
    theme_minimal()
  
  print(p)
  
  return(list(ours=Ours, null=Null, acc=acc))
}

null_ours_hist_ccle <- compute_cor_acc(ccle_results$res, Y_ccle, 10000)
## 
##  Ours info:
## -0.4380463 0.0238439 0.1109461 0.1176306 0.2086233 0.83386 26
##  Null info:
## -0.5201443 -0.05288251 0.02021679 0.01989792 0.09473658 0.6242668 19
##  Cut-off: 0.2194433

The result is not very good. Let’s double check the features:

par(mfrow=c(1,2))
for(p in 1:19){
  hist(tfMedoid[tfMedoid[, p] >= -9, p], main=colnames(tfMedoid)[p], xlab = "In TCGA", freq = T)
  hist(ccle_tf_medoid[,p], main=colnames(ccle_tf_medoid)[p], xlab = "In CCLE", freq = T)
}

for(p in 1:28){
  hist(mirMedoid[, p], main=colnames(mirMedoid)[p], xlab = "In TCGA", freq = T)
  hist(ccle_mir_medoid[,p], main=colnames(ccle_mir_medoid)[p], xlab = "In CCLE", freq = T)
}

par(mfrow=c(1,1))

Thankfully, TF features are mostly similar but with some extra zeros in TCGA data. For miRs the situation is more complicated. Upon visual inspection, many of the matched microRNAs do not have similar distributions. Worst than that, in some cases matched microRNAs in one database has almost zero distribution which can not be fixed with any method. This shows that it is going to be very difficult to have transportability with microRNAs.

# First, ensure you have the necessary library
library(transport)
library(viridis)
## Loading required package: viridisLite
# Function to calculate fast 1D Wasserstein distance
fast_wasserstein_1d <- function(P_samples, Q_samples) {
  P_sorted <- sort(P_samples)
  Q_sorted <- sort(Q_samples)
  return(mean(abs(P_sorted - Q_sorted)))
}

# Update your pairwise_wasserstein function
pairwise_wasserstein_fast <- function(data1, data2) {
  n <- ncol(data1)
  m <- matrix(0, n, n)
  for (i in 1:n) {
    for (j in 1:n) {
      P_samples <- data1[data1[, i]>-9, i]
      Q_samples <- data2[data2[, i]>-9, j]
      
      # Use fast_wasserstein_1d function
      wasserstein_dist <- fast_wasserstein_1d(P_samples, Q_samples)
      
      # Convert to distance
      m[i, j] <- wasserstein_dist
    }
  }
  return(m)
}


# Compute distance matrices
tf_distance <- pairwise_wasserstein_fast(tfMedoid, ccle_tf_medoid)
mir_distance <- pairwise_wasserstein_fast(mirMedoid, ccle_mir_medoid)

Now plot them:

library(ggplot2)
library(gridExtra)

# Plotting function
plot_density <- function(distance_matrix, diag_name, off_diag_name, plot_title) {
  
  # Create separate data frames
  df_ours <- data.frame(value = diag(distance_matrix), category = diag_name)
  df_null <- data.frame(value = c(distance_matrix[upper.tri(distance_matrix)], distance_matrix[lower.tri(distance_matrix)]), category = off_diag_name)
  
  # Combine data frames
  df <- rbind(df_ours, df_null)
  
  # Plot the density plots
  p <- ggplot(df, aes(x = value, fill = category)) +
    geom_density(aes(y = ..density..), alpha = 0.5) +
    labs(title = plot_title, 
         x = "Distance", 
         y = "Density") +
    scale_fill_manual(values = c("blue", "red"), name = "") +   # Removing the 'category' label from legend
    theme_minimal() +
    theme(legend.position=c(0.95, 0.95), legend.justification=c(1, 1)) +
    xlim(c(0, NA))   # Setting the x-axis limit starting from zero
  
  return(p)
}

# Call the function for tf_distance
p_tf <- plot_density(tf_distance, 
                     "Corresponding Pairs", 
                     "Non-corresponding Pairs (Null)", 
                     "Wasserstein Distances between TF Distributions: TCGA vs. CCLE")

# Call the function for mir_distance
p_mir <- plot_density(mir_distance, 
                      "Corresponding Pairs", 
                      "Non-corresponding Pairs (Null)", 
                      "Wasserstein Distances between miRNA Distributions: TCGA vs. CCLE")

# Combine the plots using grid.arrange and save the result as an object
combined_plot <- grid.arrange(p_tf, p_mir, ncol = 1, nrow = 2)

# Save the combined plot to a PNG
ggsave(filename =  file.path(paths$figures, "combined_plot.png"), plot = combined_plot, width = 7, height = 8, units = "in", dpi = 300)

Examples:

top_mirs <- order(diag(mir_distance), decreasing = TRUE)[1:3]

# Define the file name and start the PNG device
# png(filename = file.path(paths$figures,"miR_histograms.png"), width = 800, height = 800)

# Your existing plotting code with enlarged text elements
par(mfrow = c(3, 2), mar = c(5, 5, 4, 2))  # Adjusted left margin


for (mir in top_mirs) {
  hist(mirMedoid[,mir], 
       main = paste("Histogram of miR", colnames(mirMedoid)[mir], "in TCGA"), 
       col = "blue", 
       xlim = range(c(mirMedoid[,mir], ccle_mir_medoid[,mir])), 
       xlab = "Expression", 
       breaks = 30, 
       cex.main = 1.5,  # Enlarge main title
       cex.lab = 1.75,  # Enlarge x and y labels
       cex.axis = 1.2)  # Enlarge axis tick labels
  
  hist(ccle_mir_medoid[,mir], 
       main = paste("Histogram of miR", colnames(ccle_mir_medoid)[mir], "in CCLE"), 
       col = "red", 
       xlim = range(c(mirMedoid[,mir], ccle_mir_medoid[,mir])), 
       xlab = "Expression", 
       breaks = 30, 
       cex.main = 1.5,  # Enlarge main title
       cex.lab = 1.75,  # Enlarge x and y labels
       cex.axis = 1.2)  # Enlarge axis tick labels
}

# Close the PNG device
# dev.off()



# Identifying the three TFs with lowest distances
# top_tfs <- order(diag(tf_distance), decreasing = FALSE)[1:3]

# 
# for (tf in top_tfs) {
#   hist(tfMedoid[,tf], main = paste("Histogram of TF", colnames(tfMedoid)[tf], "in TCGA"), col = "blue", xlim = range(c(tfMedoid[,tf], ccle_tf_medoid[,tf])), breaks = 30)
#   hist(ccle_tf_medoid[,tf], main = paste("Histogram of TF", colnames(ccle_tf_medoid)[tf], "in CCLE"), col = "red", xlim = range(c(tfMedoid[,tf], ccle_tf_medoid[,tf])), breaks = 30)
# }

Appendix

This analysis was performed using the following R packages.

sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] gridExtra_2.3         viridis_0.6.2         viridisLite_0.4.0    
## [4] transport_0.14-6      ggplot2_3.4.0         ClassComparison_3.1.8
## [7] oompaBase_3.2.9       rjson_0.2.20         
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.1.1    xfun_0.35           bslib_0.4.0        
##  [4] purrr_0.3.4         splines_4.1.1       colorspace_2.0-2   
##  [7] vctrs_0.5.1         generics_0.1.1      htmltools_0.5.4    
## [10] yaml_2.2.1          utf8_1.2.2          rlang_1.0.6        
## [13] jquerylib_0.1.4     pillar_1.8.1        glue_1.6.2         
## [16] withr_2.5.0         DBI_1.1.1           BiocGenerics_0.38.0
## [19] lifecycle_1.0.3     stringr_1.4.0       munsell_0.5.0      
## [22] gtable_0.3.0        ragg_1.2.5          evaluate_0.19      
## [25] labeling_0.4.2      Biobase_2.52.0      knitr_1.41         
## [28] fastmap_1.1.0       parallel_4.1.1      fansi_0.5.0        
## [31] highr_0.9           Rcpp_1.0.7          scales_1.2.1       
## [34] cachem_1.0.6        jsonlite_1.7.2      systemfonts_1.0.3  
## [37] farver_2.1.0        textshaping_0.3.6   digest_0.6.28      
## [40] stringi_1.7.5       dplyr_1.0.10        grid_4.1.1         
## [43] cli_3.4.1           tools_4.1.1         magrittr_2.0.3     
## [46] sass_0.4.2          tibble_3.1.8        cluster_2.1.2      
## [49] pkgconfig_2.0.3     data.table_1.14.0   assertthat_0.2.1   
## [52] rmarkdown_2.19      rstudioapi_0.13     R6_2.5.1           
## [55] compiler_4.1.1